PLOS and OSF

Author
Affiliation
Thomas Pollet and Connor Malcolm

Northumbria University

Published

January 30, 2022

Modified

June 19, 2023

Abstract
PLOS and OSF Data

PLOS Data

Link Scopus/WOS data by DOI to PLOS data

# Load the PLOS export and label its third column "DI" (assumed to be the DOI
# column -- TODO confirm) so it can be compared with the Scopus/WoS tables.
PLOS <- read.csv(file = "PLOS_Dataset.csv")
names(PLOS)[3] <- "DI"

# PLOS records whose DOI also appears in the Scopus/WoS open-access tables.
# Only 15 (with a Northumbria Author)
just_northumbria_PLOS <- filter(PLOS, DI %in% scopus_wos_all_oa$DI)
# Only 9 (with a corresponding author from Northumbria.) Get descriptives for open access
corr_northumbria_PLOS <- filter(PLOS, DI %in% all_oa_corr$DI)

# Rows of the Scopus/WoS tables whose journal field (JI) contains "PLOS".
# which() drops rows where JI is missing: indexing a data frame with a logical
# NA would otherwise add an all-NA row per missing JI and inflate the counts.
# Counts noted by the authors when this was first run: 43 and 27 (recorded as
# "journals"; presumably records -- TODO confirm and re-check after this fix).
just_plos_attempt <- scopus_wos_all_oa[which(str_detect(scopus_wos_all_oa$JI, "PLOS")), ]
just_ploscorr_attempt <- all_oa_corr[which(str_detect(all_oa_corr$JI, "PLOS")), ]

# Discrepancy between the PLOS data and Scopus/Web of Science. Could be due to the PLOS data being out of date.

OSF

# Input tables for the OSF linkage, both read from CSV files in the working
# directory.
ORCID <- read.csv(file = "PURE_ORCID_EXTRACT.csv")
northumbria_OSF <- read.csv(file = "OSF_names.csv")

# Reverse the comma-separated parts of a name, e.g. "Surname, Forename"
# becomes "Forename Surname".
#
# string: a single character string.
# Returns a single character string with the parts in reverse order, joined by
# one space. A string without a comma is returned trimmed but otherwise
# unchanged; an empty string gives "".
change_order <- function(string) {
  words <- strsplit(string, ",")[[1]]
  # The blank that follows each comma would otherwise end up at the start of
  # the result (" Forename Surname") and prevent exact matches on the name.
  words <- trimws(words)
  # Drop empty pieces (e.g. from ", ,") so no double spaces are produced.
  words <- words[nzchar(words)]
  # rev() is safe for zero-length input, unlike length(words):1.
  paste(rev(words), collapse = " ")
}

# Reorder every name in the PURE/ORCID table with change_order(). vapply()
# guarantees one string per row and, with USE.NAMES = FALSE, a plain character
# vector; sapply() would attach the original names as a names attribute and
# can change its return type on empty input.
ORCID$Name <- vapply(ORCID$Name, change_order, character(1), USE.NAMES = FALSE)

# Call the first OSF column "Name" (assumed to hold the person's name -- TODO
# confirm) so both tables share the column used for matching.
colnames(northumbria_OSF)[1] <- "Name"
# Issue with slight differences in names: no complete matches. Fuzzy matching is confusing when trying to merge and delete unmatched rows.

# Fuzzy-match each PURE/ORCID name against the OSF names and keep, per ORCID
# name, the candidates with the two smallest distances for later review.
ORCID_OSF_join <- stringdist_join(ORCID, northumbria_OSF,
                by = "Name", # match on the person's name
                mode = "left", # use left join
                method = "jw", # use jw (Jaro-Winkler) distance metric
                max_dist = 99, # very permissive; the real cut-off is applied later on dist
                distance_col = "dist") %>%
  group_by(Name.x) %>%
  slice_min(order_by = dist, n = 2) %>%
  # Drop the grouping so later steps work on an ordinary (ungrouped) table.
  ungroup()


# Exploratory check (printed, not stored): candidate pairs whose two names
# start with the same three characters. Recorded output follows.
ORCID_OSF_join %>%
  filter(substr(Name.x, 1, 3) == substr(Name.y, 1, 3))
# A tibble: 0 × 6
# Groups:   Name.x [0]
# ℹ 6 variables: Name.x <chr>, Username <chr>, FTE <dbl>, ORCID <chr>, Name.y <chr>, dist <dbl>
# Keep only candidate pairs whose distance is at most 0.25.
ORCID_OSF_join <- ORCID_OSF_join[ORCID_OSF_join[["dist"]] <= 0.25, ]

###write_csv(ORCID_OSF_join, "ORCID_OSF_join.csv") I manually went through the fuzzy matches and removed any that were clearly incorrect.

# Read the reviewed match table back in and drop the helper columns that are
# not needed for linking (distance, FTE and the OSF-side name).
ORCID_neat_join <- read.csv(file = "ORCID_OSF_join.csv")
ORCID_neat_join <- ORCID_neat_join[
  ,
  setdiff(names(ORCID_neat_join), c("dist", "FTE", "Name.y")),
  drop = FALSE
]
# Affiliation (University of Northumbria) in Scopus, then click authors, then export as CSV. Linking OSF to Scopus authors, but with ORCID taken from Excel. Plus manual checking for multiple matches.

# The two Scopus author exports (file names indicate A-Z and Z-A ordering).
A_Z <- read.csv(file = "scopus A-Z.csv")
Z_A <- read.csv(file = "scopus Z-A.csv")

#Scopus by affiliation for authors. A-Z 4000 in one and Z-A 4000 in the other to capture all authors.

# Combine the two Scopus exports into one author table with one row per
# Auth.ID. The exports are stacked and de-duplicated rather than merge()d:
# merge() without `all` is an inner join and would keep only the authors that
# occur in BOTH exports, instead of capturing all authors.
scopus_authors <- local({
  combined <- rbind(A_Z, Z_A)
  # For an author present in both exports keep the first (A-Z) row.
  combined <- combined[!duplicated(combined[["Auth.ID"]]), , drop = FALSE]
  rownames(combined) <- NULL

  # Rename by column name rather than by position, so a different column
  # order in the export cannot mislabel a column.
  new_names <- c(
    Author.Name = "Name",
    Number.of.Documents = "Number_of_Documents",
    Subject.Area = "Subject_Area",
    Orc_ID = "ORCID"
  )
  present <- intersect(names(new_names), names(combined))
  names(combined)[match(present, names(combined))] <- unname(new_names[present])

  # Auth.ID first, as in the previous merge()-based layout.
  combined[, c("Auth.ID", setdiff(names(combined), "Auth.ID")), drop = FALSE]
})

# Treat empty strings as missing values in both tables.
scopus_authors[scopus_authors == ""] <- NA
ORCID_neat_join[ORCID_neat_join == ""] <- NA

# Only rows that carry an ORCID can be linked, so drop the rest.
scopus_authors <- scopus_authors[!is.na(scopus_authors[["ORCID"]]), ]
ORCID_neat_join <- ORCID_neat_join[!is.na(ORCID_neat_join[["ORCID"]]), ]

# Attach the Scopus author details to the OSF/ORCID matches via ORCID.
# Authors' note from the original run: only 7 on OSF, ORCID and on Scopus.
scopus_and_OSF <- ORCID_neat_join %>%
  left_join(scopus_authors, by = "ORCID")